Loading in necessary packages

library(DESeq2)
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.3.2
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.2
## Loading required package: SummarizedExperiment
## Warning: package 'SummarizedExperiment' was built under R version 4.3.2
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(ggplot2)
library(genefilter)
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:MatrixGenerics':
## 
##     rowSds, rowVars
## The following objects are masked from 'package:matrixStats':
## 
##     rowSds, rowVars
library(GenomicFeatures)
## Warning: package 'GenomicFeatures' was built under R version 4.3.2
## Loading required package: AnnotationDbi
library(biomaRt)
## Warning: package 'biomaRt' was built under R version 4.3.2
library(knitr)
library(reshape2)
library(scales)
library(Biostrings)
## Warning: package 'Biostrings' was built under R version 4.3.2
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
library(kableExtra)
library(pheatmap)
library(org.Mm.eg.db)
## 
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.3.2
## 
## clusterProfiler v4.10.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
library("RColorBrewer")

Creating the Input + Gene annotations

#read in the ensembl.genes 

mm.gtf.db <- makeTxDbFromGFF("annotation/Mus_musculus.GRCm38.102.chr.gtf", format="gtf" )
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK
ensembl.genes = genes(mm.gtf.db)

mouse = useEnsembl(biomart="ENSEMBL_MART_ENSEMBL", dataset="mmusculus_gene_ensembl", version = "102")

bm.annotations = getBM(attributes=c("ensembl_gene_id", "entrezgene_id", "gene_biotype", "mgi_symbol", "description", "external_gene_name"), mart=mouse, filters="ensembl_gene_id", values=ensembl.genes$gene_id, uniqueRows=TRUE)

ensembl.genes$external_gene_name = bm.annotations$external_gene_name[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]

ensembl.genes$gene_biotype = bm.annotations$gene_biotype[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]

ensembl.genes$mgi_symbol = bm.annotations$mgi_symbol[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]

ensembl.genes$description = bm.annotations$description[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]

ensembl.genes$entrezgene_id = bm.annotations$entrezgene_id[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]

Creating dataset from the Experimental metadata

#read in the experimental metadata 
experimental_metadata = read.delim("experimental_metadata.txt", sep="\t")

#deleting the row that has KPC cachectic replicate 2 

#create matrix of data 
data = matrix(0, ncol=length(experimental_metadata$name), nrow=55401)
colnames(data)= experimental_metadata$name
for( i in experimental_metadata$name){
  data[,i] = read.table(paste("data/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$expected_count
}
row.names(data) = read.table(paste("data/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$gene_id

#Create a factor for the condition column (Cachectic vs. Control) - by making it a factor you give it an order
experimental_metadata$condition = factor(experimental_metadata$condition, levels=c("control", "cachectic"))

#Create a factor for the replicate column - by making it a factor you give it an order
experimental_metadata$replicate = factor(experimental_metadata$replicate, levels=c("1", "2"))

#Create a factor for the cell type column - by making it a factor you give it an order
experimental_metadata$cell_line = factor(experimental_metadata$cell_line, levels = c("KPP", "KPC", "C26", "LLC"))

#Preparing for DESeq - converting everything to be integers 
data_mat = apply(round(data), c(1,2), as.integer)
data_mat = subset(data_mat, select= -8) #removing the 8th column that consists of KPC cachectic rep
experimental_metadata = experimental_metadata[-8,]
data_mat = data_mat[!(row.names(data_mat) %in% ensembl.genes$gene_id[ensembl.genes$gene_biotype %in% c("rRNA", "snoRNA", "snRNA", "Mt_rRNA")]),]
data_mat = data_mat[rowSums(data_mat) > 0,]
data_mat = data_mat[ !(row.names(data_mat) %in% ensembl.genes[seqnames(ensembl.genes) == "chrM"]$gene_id), ]
dds = DESeqDataSetFromMatrix(data_mat, experimental_metadata, ~ cell_line + condition)

dds <- estimateSizeFactors(dds) 
dds <- estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
rld <- rlog(dds)
sampleDists <- dist(t(assay(rld)))
plot(hclust(sampleDists))

Correlation Heatmap

annot = dplyr::select(experimental_metadata, c(cell_line, condition))
row.names(annot) = experimental_metadata$name
rld %>%
  assay() %>%
  cor() %>%
  pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
           annotation = annot,
           annotation_colors = list(
                               condition = c(control = "blue", cachectic= "red"),
                               cell_line = c(C26 = "#BF3100", KPC = "#E9B44C", 
                                             KPP = "#1B998B", LLC = "#5D576B")),
           cluster_rows = TRUE,
           cluster_cols = T,
           cellwidth = 13,
           cellheight = 13)

PCA

ntop = 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select,]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)

pca_data <- plotPCA(rld, intgroup = c("cell_line", "condition"), returnData=TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=condition, shape=cell_line)) + geom_point(size=3) +
  scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance"), limits=c(-20, 30)) +
  scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_classic() + geom_text(data = pca_data, aes(PC1,PC2, label = name), hjust = 1.2)

PC1 is really messing up with C26 - represents differences in mouse strain, so there are definitely batch/technical artefacts in this dataset - lets use SVA to handle this

library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:Biostrings':
## 
##     collapse
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: BiocParallel
dat <- counts(dds, normalized= TRUE)
idx  <- rowMeans(dat) > 1
dat  <- dat[idx, ]
mod  <- model.matrix(~ cell_line + condition, colData(dds))
mod0 <- model.matrix(~cell_line , colData(dds)) #
svseq <- svaseq(dat, mod, mod0, n.sv = 2) # allowing n.sv does not enable the linear model to converge
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + cell_line + condition
rld <- rlog(ddssva)

Plotting PCA of SV1 - to analyse if this is the hidden effect we are looking for

(pca_data <- plotPCA(rld, intgroup = c("SV1", "SV2"), returnData=TRUE))
## using ntop=500 top features by variance
##                           PC1        PC2
## KPP_control_rep1   -8.5659692   4.668301
## KPP_control_rep2   -8.5874817   4.199114
## KPP_cachectic_rep1 -4.1120988 -10.507347
## KPP_cachectic_rep2 -6.0756795  -8.722633
## KPC_control_rep1   -8.2169924   6.852097
## KPC_control_rep2   -8.7191768   6.346721
## KPC_cachectic_rep1 -0.8345102 -10.301588
## C26_control_rep1   13.2829266  14.087862
## C26_control_rep2   14.0758737  13.876702
## C26_cachectic_rep1 21.3412882  -4.676311
## C26_cachectic_rep2 23.0152655  -6.570683
## LLC_control_rep1   -9.3395282   6.940298
## LLC_control_rep2   -9.1146540   6.301143
## LLC_cachectic_rep1 -1.9456672 -15.460528
## LLC_cachectic_rep2 -6.2035960  -7.033147
##                                                      group          SV1
## KPP_control_rep1     0.094108912016152:-0.0382906543645605  0.094108912
## KPP_control_rep2      0.133402315644739:-0.117630544175785  0.133402316
## KPP_cachectic_rep1      0.21668690853748:0.146729360140979  0.216686909
## KPP_cachectic_rep2     0.355302897129738:0.306700418492008  0.355302897
## KPC_control_rep1      0.108359916290531:-0.114565027644468  0.108359916
## KPC_control_rep2     0.167178845254626:-0.0380267107240203  0.167178845
## KPC_cachectic_rep1  0.00995550326925079:0.0467208833882533  0.009955503
## C26_control_rep1     -0.230124798302928:0.0649040045876099 -0.230124798
## C26_control_rep2      -0.282961574795913:0.229989638111462 -0.282961575
## C26_cachectic_rep1    -0.589434990498833:0.275173714211235 -0.589434990
## C26_cachectic_rep2   -0.458284126490093:-0.306645561503745 -0.458284126
## LLC_control_rep1      0.153209746621263:0.0838660206608385  0.153209747
## LLC_control_rep2      0.0864716410217112:0.217256100722409  0.086471641
## LLC_cachectic_rep1   0.0437252518953962:-0.755813811032454  0.043725252
## LLC_cachectic_rep2 0.192403552406881:-0.000367830869761334  0.192403552
##                              SV2               name
## KPP_control_rep1   -0.0382906544   KPP_control_rep1
## KPP_control_rep2   -0.1176305442   KPP_control_rep2
## KPP_cachectic_rep1  0.1467293601 KPP_cachectic_rep1
## KPP_cachectic_rep2  0.3067004185 KPP_cachectic_rep2
## KPC_control_rep1   -0.1145650276   KPC_control_rep1
## KPC_control_rep2   -0.0380267107   KPC_control_rep2
## KPC_cachectic_rep1  0.0467208834 KPC_cachectic_rep1
## C26_control_rep1    0.0649040046   C26_control_rep1
## C26_control_rep2    0.2299896381   C26_control_rep2
## C26_cachectic_rep1  0.2751737142 C26_cachectic_rep1
## C26_cachectic_rep2 -0.3066455615 C26_cachectic_rep2
## LLC_control_rep1    0.0838660207   LLC_control_rep1
## LLC_control_rep2    0.2172561007   LLC_control_rep2
## LLC_cachectic_rep1 -0.7558138110 LLC_cachectic_rep1
## LLC_cachectic_rep2 -0.0003678309 LLC_cachectic_rep2
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=SV1)) + geom_point(size=3) +
  scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance"), limits = c(-15, 30)) +
  scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + ggtitle("PCA with SV1 highlighted")+ theme_classic() + geom_text(size= 3, data = pca_data, aes(PC1,PC2, label = name), hjust = 0.6)

boxplot(rld$SV1 ~ rld$cell_line, xlab="Cell line", ylab="SV1", ylim=c(-0.6, 0.4))

stripchart(rld$SV1 ~rld$cell_line,  
           method = "jitter",       
           pch = 19,          
           col = 4,            
           vertical = TRUE,      
           add = TRUE)

(pca_data <- plotPCA(rld, intgroup = c("SV1", "SV2"), returnData=TRUE))
## using ntop=500 top features by variance
##                           PC1        PC2
## KPP_control_rep1   -8.5659692   4.668301
## KPP_control_rep2   -8.5874817   4.199114
## KPP_cachectic_rep1 -4.1120988 -10.507347
## KPP_cachectic_rep2 -6.0756795  -8.722633
## KPC_control_rep1   -8.2169924   6.852097
## KPC_control_rep2   -8.7191768   6.346721
## KPC_cachectic_rep1 -0.8345102 -10.301588
## C26_control_rep1   13.2829266  14.087862
## C26_control_rep2   14.0758737  13.876702
## C26_cachectic_rep1 21.3412882  -4.676311
## C26_cachectic_rep2 23.0152655  -6.570683
## LLC_control_rep1   -9.3395282   6.940298
## LLC_control_rep2   -9.1146540   6.301143
## LLC_cachectic_rep1 -1.9456672 -15.460528
## LLC_cachectic_rep2 -6.2035960  -7.033147
##                                                      group          SV1
## KPP_control_rep1     0.094108912016152:-0.0382906543645605  0.094108912
## KPP_control_rep2      0.133402315644739:-0.117630544175785  0.133402316
## KPP_cachectic_rep1      0.21668690853748:0.146729360140979  0.216686909
## KPP_cachectic_rep2     0.355302897129738:0.306700418492008  0.355302897
## KPC_control_rep1      0.108359916290531:-0.114565027644468  0.108359916
## KPC_control_rep2     0.167178845254626:-0.0380267107240203  0.167178845
## KPC_cachectic_rep1  0.00995550326925079:0.0467208833882533  0.009955503
## C26_control_rep1     -0.230124798302928:0.0649040045876099 -0.230124798
## C26_control_rep2      -0.282961574795913:0.229989638111462 -0.282961575
## C26_cachectic_rep1    -0.589434990498833:0.275173714211235 -0.589434990
## C26_cachectic_rep2   -0.458284126490093:-0.306645561503745 -0.458284126
## LLC_control_rep1      0.153209746621263:0.0838660206608385  0.153209747
## LLC_control_rep2      0.0864716410217112:0.217256100722409  0.086471641
## LLC_cachectic_rep1   0.0437252518953962:-0.755813811032454  0.043725252
## LLC_cachectic_rep2 0.192403552406881:-0.000367830869761334  0.192403552
##                              SV2               name
## KPP_control_rep1   -0.0382906544   KPP_control_rep1
## KPP_control_rep2   -0.1176305442   KPP_control_rep2
## KPP_cachectic_rep1  0.1467293601 KPP_cachectic_rep1
## KPP_cachectic_rep2  0.3067004185 KPP_cachectic_rep2
## KPC_control_rep1   -0.1145650276   KPC_control_rep1
## KPC_control_rep2   -0.0380267107   KPC_control_rep2
## KPC_cachectic_rep1  0.0467208834 KPC_cachectic_rep1
## C26_control_rep1    0.0649040046   C26_control_rep1
## C26_control_rep2    0.2299896381   C26_control_rep2
## C26_cachectic_rep1  0.2751737142 C26_cachectic_rep1
## C26_cachectic_rep2 -0.3066455615 C26_cachectic_rep2
## LLC_control_rep1    0.0838660207   LLC_control_rep1
## LLC_control_rep2    0.2172561007   LLC_control_rep2
## LLC_cachectic_rep1 -0.7558138110 LLC_cachectic_rep1
## LLC_cachectic_rep2 -0.0003678309 LLC_cachectic_rep2
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=SV2)) + geom_point(size=3) +
  scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance")) +
  scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + ggtitle("PCA for whole dataset")+ theme_classic() + geom_text(size= 3, data = pca_data, aes(PC1,PC2, label = name), hjust = 0.6)

boxplot(rld$SV2 ~ rld$cell_line, xlab="Cell line", ylab="SV2", ylim=c(-0.8, 0.4))

stripchart(rld$SV2 ~rld$cell_line,  
           method = "jitter",       
           pch = 19,          
           col = 4,            
           vertical = TRUE,      
           add = TRUE)

##PCA after removingBatchEffect

assay(rld) <- limma::removeBatchEffect(assay(rld), covariates=rld$SV1)
assay(rld) <- limma::removeBatchEffect(assay(rld), covariates = rld$SV2)

set.seed(1)
(pca_data <- plotPCA(rld, intgroup = c("condition", "cell_line"), returnData=TRUE))
## using ntop=500 top features by variance
##                           PC1         PC2         group condition cell_line
## KPP_control_rep1    -6.566336   1.5310023   control:KPP   control       KPP
## KPP_control_rep2    -6.687889   2.1484997   control:KPP   control       KPP
## KPP_cachectic_rep1  13.974273   6.3902714 cachectic:KPP cachectic       KPP
## KPP_cachectic_rep2  13.417139   5.2760307 cachectic:KPP cachectic       KPP
## KPC_control_rep1    -9.799699  -3.7412844   control:KPC   control       KPC
## KPC_control_rep2    -8.575491  -6.7337349   control:KPC   control       KPC
## KPC_cachectic_rep1  10.850402 -13.4364170 cachectic:KPC cachectic       KPC
## C26_control_rep1   -11.244394   4.6297399   control:C26   control       C26
## C26_control_rep2    -9.381332   3.4055672   control:C26   control       C26
## C26_cachectic_rep1   9.660820  -2.1739595 cachectic:C26 cachectic       C26
## C26_cachectic_rep2   6.739228   2.2287239 cachectic:C26 cachectic       C26
## LLC_control_rep1    -8.378481   0.2811608   control:LLC   control       LLC
## LLC_control_rep2    -6.629013  -0.5519275   control:LLC   control       LLC
## LLC_cachectic_rep1   6.243037   2.6565163 cachectic:LLC cachectic       LLC
## LLC_cachectic_rep2   6.377735  -1.9101889 cachectic:LLC cachectic       LLC
##                                  name
## KPP_control_rep1     KPP_control_rep1
## KPP_control_rep2     KPP_control_rep2
## KPP_cachectic_rep1 KPP_cachectic_rep1
## KPP_cachectic_rep2 KPP_cachectic_rep2
## KPC_control_rep1     KPC_control_rep1
## KPC_control_rep2     KPC_control_rep2
## KPC_cachectic_rep1 KPC_cachectic_rep1
## C26_control_rep1     C26_control_rep1
## C26_control_rep2     C26_control_rep2
## C26_cachectic_rep1 C26_cachectic_rep1
## C26_cachectic_rep2 C26_cachectic_rep2
## LLC_control_rep1     LLC_control_rep1
## LLC_control_rep2     LLC_control_rep2
## LLC_cachectic_rep1 LLC_cachectic_rep1
## LLC_cachectic_rep2 LLC_cachectic_rep2
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)

ggplot(pca_data, aes(PC1, PC2, color=condition, shape=cell_line)) + geom_point(size=3) +
scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance"), limits = c(-20,20)) +
scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance"), limits = c(-15, 10)) + 
  ggtitle("PCA after correction")+ theme_classic() + geom_text(size= 3, data = pca_data, aes(PC1,PC2, label = name), hjust = 0.2)

sampleDists <- dist(t(assay(rld)))
plot(hclust(sampleDists))

Differential Expression

#filter
filter = apply(counts(ddssva, normalized=TRUE), 1, function(x){ mean(x) >= 1 })
ddssva = ddssva[filter, ]

ddssva <- estimateSizeFactors(ddssva) 
ddssva <- estimateDispersions(ddssva)
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
ddssva = nbinomLRT(ddssva, full= ~1 + cell_line + SV1 + SV2 + condition, reduced = ~ 1 + cell_line + SV1 + SV2)
results.lrt = results(ddssva)
cachectic_vs_control = results(ddssva, contrast=c("condition", "cachectic", "control"), independentFiltering = TRUE, alpha=0.1)
cachectic_vs_control <- lfcShrink(ddssva,
    coef = "condition_cachectic_vs_control", res=cachectic_vs_control, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

MA plot

plotMA(cachectic_vs_control, colSig="red")

Volcano plot

Plotting volcano plots to visualise the differential expression

#volcano plots for differences between control and cachectic 
par(mar=c(5,5,5,5), cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)

# Adjusted P values (FDR Q values)
with(cachectic_vs_control, plot(log2FoldChange, -log10(padj), pch=20, main="Volcano plot of control vs. cachectic", cex=1.0, xlab=bquote(~Log[2]~fold~change), ylab=bquote(~-log[10]~padj~value) ) )

with(subset(cachectic_vs_control, padj<0.1 & log2FoldChange> 1), points(log2FoldChange, -log10(padj), pch=20, col="red", cex=0.5))

with(subset(cachectic_vs_control, padj<0.1 & log2FoldChange< -1), points(log2FoldChange, -log10(padj), pch=20, col="blue", cex=0.5))


#Add lines for absolute FC>2 and P-value cut-off at FDR Q<0.1
abline(v=0, col="black", lty=3, lwd=1.0)
abline(v=-1, col="black", lty=4, lwd=2.0)
abline(v=1, col="black", lty=4, lwd=2.0)
abline(h=-log10(0.1), col="black", lty=4, lwd=2.0)

legend("right", legend=c("Upregulated", "Downregulated", "Not"),fill= c("red", "blue", "black"))

Clustering

So there are only two experimental conditions we care about in this analysis, however it would be good to visualise expression across the cell line, so lets plot a heatmap with two clusters (those upregulated and those downregulated)

rld <- rlog(ddssva)

assay(rld) <- limma::removeBatchEffect(assay(rld), covariates=rld$SV1)
assay(rld) <- limma::removeBatchEffect(assay(rld), covariates = rld$SV2)

significant_results=cachectic_vs_control[!is.na(cachectic_vs_control$padj) & cachectic_vs_control$padj<0.1,] ##10% are false positives. 
rld_signif = assay(rld)[rownames(significant_results),]

#how many genes are significantly different?
nrow(rld_signif) #6465
## [1] 6465
rld_z = t(apply(rld_signif, 1, function(x){ (x - mean(x)) / sd(x)}))
thr = 3  ##threshold of 3 sd away 
rld_z[rld_z > thr] = thr ## setting rld_z values > 3 as 3 
rld_z[rld_z < -thr] = -thr ## setting rld_z values <-3 as -3 

paletteLength = 20 ##making a pheatmap 
breaksList <- c(seq(-thr, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(thr/paletteLength, thr, length.out=floor(paletteLength/2)))

# sort out colour scheme
color = c(colorRampPalette(c("mediumblue", "white"))(14), colorRampPalette(c("white", "firebrick2"))(14))
## from blue-->white--> brick red 
breaksList = seq(-3, 3, length.out = 29)
paletteLength = 20 ##making a pheatmap 
breaksList <- c(seq(-thr, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(thr/paletteLength, thr, length.out=floor(paletteLength/2)))

color = c(colorRampPalette(c("mediumblue", "white"))(14), colorRampPalette(c("white", "firebrick2"))(14))
## from blue-->white--> brick red 
breaksList = seq(-3, 3, length.out = 29)
set.seed(2)
nclust = 2
results.coef.kmeans =  kmeans(rld_z, nclust, nstart=100, iter.max=50)
results.coef = rld_z[order(results.coef.kmeans$cluster, decreasing=TRUE),]
indicator = results.coef.kmeans$cluster[order(results.coef.kmeans$cluster)]  ## only want the cluster data
table(indicator)
## indicator
##    1    2 
## 3249 3216
heat.map <- pheatmap(results.coef, cluster_col=TRUE, breaks=breaksList, cluster_rows=FALSE, show_rownames=FALSE,color = color,fontsize_row = 3, legend=TRUE,border_color = NA, )

cachectic_vs_control.df = as.data.frame(cachectic_vs_control)
cachectic_vs_control.df$mgi_symbol = bm.annotations$mgi_symbol[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$gene_biotype = bm.annotations$gene_biotype[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$description = bm.annotations$description[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$entrezgene_id = bm.annotations$entrezgene_id[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$cluster = NA
cachectic_vs_control.df[names(results.coef.kmeans$cluster),]$cluster = ifelse(results.coef.kmeans$cluster==2, "UP", "DOWN")

write.csv(cachectic_vs_control.df, "./results/talbert_mouse_df.csv")

Functional enrichments

Annotating the clusters with the terms/processes/pathways that they are enriched with. Do these annotations make sense given the patterns in the clusters? Remember that clustering ordering changes between runs so make sure to remember to saveRDS and readRDS.

CLUSTER 1

c1 = row.names(cachectic_vs_control.df[!is.na(cachectic_vs_control.df$cluster) & cachectic_vs_control.df$cluster=="UP",])

heat.map.c1 <- pheatmap(results.coef[c1,], cluster_col=TRUE, breaks=breaksList, cluster_rows=FALSE, show_rownames=FALSE,color = color,fontsize_row = 3, legend=TRUE,border_color = NA, main = "Cluster 1")

ego.BP_c1 <- enrichGO(gene    = c1,
                universe      = rownames(dds), 
                OrgDb         = org.Mm.eg.db,  
                keyType       = 'ENSEMBL',
                ont           = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
                readable      = TRUE)

write.csv(as.data.frame(ego.BP_c1), "./results/go_bp_up.csv")
saveRDS(ego.BP_c1, "rds/ego.BP_up.rds")
dotplot(ego.BP_c1, title="GO:BP Cluster 1")

ego.MF_c1 <- enrichGO(gene          = c1,
                universe      = rownames(dds), 
                OrgDb         = org.Mm.eg.db, 
                keyType       = 'ENSEMBL', 
                ont           = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
        readable      = TRUE)

write.csv(as.data.frame(ego.MF_c1), "./results/go_mf_up.csv")
saveRDS(ego.MF_c1, "rds/ego.MF_up.rds")
dotplot(ego.MF_c1, title="GO:MF Cluster 1")

ego.CC_c1 <- enrichGO(gene          = c1,
                universe      = rownames(dds),
                OrgDb         = org.Mm.eg.db,  
                keyType       = 'ENSEMBL',  
                ont           = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
        readable      = TRUE)

write.csv(as.data.frame(ego.CC_c1), "./results/go_cc_up.csv")
saveRDS(ego.CC_c1, "rds/ego.CC_up.rds")
dotplot(ego.CC_c1, title="GO:CC Cluster 1")

KEGG Analysis

kegg_c1 <- enrichKEGG( gene= as.character(ensembl.genes[c1,]$entrezgene_id),
                      universe = as.character(ensembl.genes[rownames(dds),]$entrezgene_id), 
                      organism = 'mmu' ,
                      pvalueCutoff = 1,
                      qvalueCutoff = 1)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/mmu/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/mmu"...
write.csv(as.data.frame(kegg_c1), "./results/kegg_up.csv")
saveRDS(kegg_c1, "rds/KEGG_up.rds")
dotplot(kegg_c1, title= "KEGG Cluster 1")

CLUSTER 2

c2 = row.names(cachectic_vs_control.df[!is.na(cachectic_vs_control.df$cluster) & cachectic_vs_control.df$cluster=="DOWN",])


heat.map.c2 <- pheatmap(results.coef[c2,], cluster_col=TRUE, breaks=breaksList, cluster_rows=FALSE, show_rownames=FALSE,color = color,fontsize_row = 3, legend=TRUE,border_color = NA, main = "Cluster 2")

ego.BP_c2 <- enrichGO(gene    = c2,
                universe      = rownames(dds), 
                OrgDb         = org.Mm.eg.db,  
                keyType       = 'ENSEMBL',  
                ont           = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
        readable      = TRUE)

write.csv(as.data.frame(ego.BP_c2), "./results/go_bp_dn.csv")
saveRDS(ego.BP_c2, "rds/ego.BP_dn.rds")
dotplot(ego.BP_c2, title="GO:BP Cluster 2")

ego.MF_c2 <- enrichGO(gene    = c2,
                universe      = rownames(dds), 
                OrgDb         = org.Mm.eg.db,  
                keyType       = 'ENSEMBL',  
                ont           = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
        readable      = TRUE)

write.csv(as.data.frame(ego.MF_c2), "./results/go_mf_dn.csv")
saveRDS(ego.MF_c2, "rds/ego.MF_dn.rds")
dotplot(ego.MF_c2, title="GO:MF Cluster 2")

ego.CC_c2 <- enrichGO(gene    = c2,
                universe      = rownames(dds), 
                OrgDb         = org.Mm.eg.db, 
                keyType       = 'ENSEMBL',  
                ont           = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff  = 1,
                qvalueCutoff  = 1,
                readable      = TRUE)

saveRDS(ego.CC_c2, "rds/ego.CC_dn.rds")
write.csv(as.data.frame(ego.CC_c2), "./results/go_cc_dn.csv")
dotplot(ego.CC_c2, title="GO:CC Cluster 2")

kegg_c2 <- enrichKEGG( gene= as.character(ensembl.genes[c2,]$entrezgene_id),
                      universe = as.character(ensembl.genes[rownames(dds),]$entrezgene_id),
                      organism = 'mmu' ,
                      pvalueCutoff=1,
                      qvalueCutoff =1)

write.csv(as.data.frame(kegg_c2), "./results/kegg_dn.csv")
saveRDS(kegg_c2, "rds/KEGG_dn.rds")
dotplot(kegg_c2, title= "KEGG Cluster 2")

Functional enrichment plots

test <- simplify(ego.BP_c1, cutoff=0.7, by="qvalue", select_fun= min, measure="Wang")
library(stringr)
up.terms = rbind(
  ego.BP_c1[ego.BP_c1$Description %in% c("mRNA processing", "ribosome biogenesis", "regulation of apoptotic signaling pathway", "muscle cell proliferation"),
            c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")], 
  ego.MF_c1[ego.MF_c1$Description %in% c("RNA polymerase II-specific DNA-binding transcription factor binding", "structural constituent of ribosome"),
            c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")],
  kegg_c1[kegg_c1$Description %in% c("HIF-1 signaling pathway - Mus musculus (house mouse)", "Mitophagy - animal - Mus musculus (house mouse)", "Notch signaling pathway - Mus musculus (house mouse)"),
          c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")]
  )

up.terms$ontology = c(rep("GO:BP", 4), rep("GO:MF", 2), rep("KEGG", 3))
up.terms$count = sapply(str_split(up.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])})
up.terms$gr = sapply(str_split(up.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})

up.terms$bgr = sapply(str_split(up.terms$BgRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})
up.terms$bg = sapply(str_split(up.terms$BgRatio, "/"), function(x){ as.numeric(x[1])})
up.terms$gb = up.terms$count / up.terms$bg
up.terms$category = "UP"

down.terms = rbind(
  ego.BP_c2[ego.BP_c2$Description %in% c("muscle cell differentiation", "collagen fibril organization", "tricarboxylic acid cycle", "response to oxygen levels"),
            c("ID", "Description", "GeneRatio", "pvalue", "p.adjust", "qvalue", "BgRatio")],
  ego.CC_c2[ego.CC_c2$Description %in% c("myofibril", "respirasome"),
            c("ID", "Description", "BgRatio", "GeneRatio", "pvalue", "p.adjust", "qvalue")],
  kegg_c2[kegg_c2$Description %in% c("Oxidative phosphorylation - Mus musculus (house mouse)", "Cardiac muscle contraction - Mus musculus (house mouse)", "Focal adhesion - Mus musculus (house mouse)"),
          c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")]
  )

down.terms$ontology = c(rep("GO:BP", 4), rep("GO:CC", 2), rep("KEGG", 3))
down.terms$count = sapply(str_split(down.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])})
down.terms$gr = sapply(str_split(down.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})
down.terms$bgr = sapply(str_split(down.terms$BgRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})
down.terms$bg = sapply(str_split(down.terms$BgRatio, "/"), function(x){ as.numeric(x[1])})
down.terms$gb = down.terms$count / down.terms$bg
down.terms$category = "DOWN"

terms.to.plot = rbind(up.terms, down.terms)

terms.to.plot$Description = factor(terms.to.plot$Description,
                                   rev(c("mRNA processing", 
                                         "ribosome biogenesis", 
                                         "regulation of apoptotic signaling pathway", 
                                         "muscle cell proliferation", 
                                         "RNA polymerase II-specific DNA-binding transcription factor binding", 
                                         "structural constituent of ribosome", 
                                         "HIF-1 signaling pathway - Mus musculus (house mouse)", 
                                         "Mitophagy - animal - Mus musculus (house mouse)", 
                                         "Notch signaling pathway - Mus musculus (house mouse)",
                                         "muscle cell differentiation", 
                                         "collagen fibril organization", 
                                         "tricarboxylic acid cycle", 
                                         "response to oxygen levels", 
                                         "myofibril", "respirasome", 
                                         "Oxidative phosphorylation - Mus musculus (house mouse)", 
                                         "Cardiac muscle contraction - Mus musculus (house mouse)", 
                                         "Focal adhesion - Mus musculus (house mouse)")))

ggplot(terms.to.plot, aes(x=gb, y=Description, size=count, colour=qvalue)) + geom_point() + scale_x_continuous("Fraction of annotated genes", limits=c(0, 1)) + theme_bw() + scale_color_continuous(limits=c(0, 0.05), high="blue", low="red") + facet_grid(~category)

  ggplot(terms.to.plot, aes(x=gr, y=Description, size=count, colour=qvalue)) + geom_point() + scale_x_continuous("Gene Ratio", limits=c(0, 0.1)) + theme_bw() + scale_color_continuous(limits=c(0, 0.05), high="blue", low="red") + facet_grid(~category)

Mef2c gene expression plot

genes_plot <- data.frame(
  model = ddssva$cell_line,
  condition = ddssva$condition,
  expression = counts(ddssva, normalized = TRUE)[c("ENSMUSG00000005583"), ])


genes_plot %>%
  ggplot(aes(x = condition, y = expression, group = condition, fill=condition)) + 
  geom_boxplot() +
  #geom_text(aes(label=round(value,2)), vjust=-0.3, size=3.5) + 
  xlab("Samples") +
  ylab("Normalised Counts") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  geom_point(data=genes_plot, aes(x=condition, y=expression, group=condition), size=2) + scale_y_continuous(limits=c(5000, 25000)) + facet_grid(~model)

saveRDS(ddssva,  "rds/ddssva.rds")
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/London
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] stringr_1.5.1               sva_3.50.0                 
##  [3] BiocParallel_1.36.0         mgcv_1.9-1                 
##  [5] nlme_3.1-164                RColorBrewer_1.1-3         
##  [7] clusterProfiler_4.10.0      org.Mm.eg.db_3.18.0        
##  [9] pheatmap_1.0.12             kableExtra_1.4.0           
## [11] Biostrings_2.70.2           XVector_0.42.0             
## [13] scales_1.3.0                reshape2_1.4.4             
## [15] knitr_1.45                  biomaRt_2.58.2             
## [17] GenomicFeatures_1.54.3      AnnotationDbi_1.64.1       
## [19] genefilter_1.84.0           ggplot2_3.5.0              
## [21] DESeq2_1.42.0               SummarizedExperiment_1.32.0
## [23] Biobase_2.62.0              MatrixGenerics_1.14.0      
## [25] matrixStats_1.2.0           GenomicRanges_1.54.1       
## [27] GenomeInfoDb_1.38.6         IRanges_2.36.0             
## [29] S4Vectors_0.40.2            BiocGenerics_0.48.1        
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.1            BiocIO_1.12.0            bitops_1.0-7            
##   [4] ggplotify_0.1.2          filelock_1.0.3           tibble_3.2.1            
##   [7] polyclip_1.10-6          XML_3.99-0.16.1          lifecycle_1.0.4         
##  [10] mixsqp_0.3-54            edgeR_4.0.16             lattice_0.22-5          
##  [13] MASS_7.3-60.0.1          magrittr_2.0.3           limma_3.58.1            
##  [16] sass_0.4.8               rmarkdown_2.25           jquerylib_0.1.4         
##  [19] yaml_2.3.8               cowplot_1.1.3            DBI_1.2.2               
##  [22] abind_1.4-5              zlibbioc_1.48.0          purrr_1.0.2             
##  [25] ggraph_2.2.0             RCurl_1.98-1.14          yulab.utils_0.1.4       
##  [28] tweenr_2.0.3             rappdirs_0.3.3           GenomeInfoDbData_1.2.11 
##  [31] enrichplot_1.22.0        ggrepel_0.9.5            irlba_2.3.5.1           
##  [34] tidytree_0.4.6           annotate_1.80.0          svglite_2.1.3           
##  [37] codetools_0.2-19         DelayedArray_0.28.0      DOSE_3.28.2             
##  [40] xml2_1.3.6               ggforce_0.4.2            tidyselect_1.2.1        
##  [43] aplot_0.2.2              farver_2.1.1             viridis_0.6.5           
##  [46] BiocFileCache_2.10.1     GenomicAlignments_1.38.2 jsonlite_1.8.8          
##  [49] tidygraph_1.3.1          survival_3.5-8           systemfonts_1.0.5       
##  [52] tools_4.3.1              progress_1.2.3           treeio_1.26.0           
##  [55] Rcpp_1.0.12              glue_1.7.0               gridExtra_2.3           
##  [58] SparseArray_1.2.4        xfun_0.42                qvalue_2.34.0           
##  [61] dplyr_1.1.4              withr_3.0.0              fastmap_1.1.1           
##  [64] fansi_1.0.6              truncnorm_1.0-9          digest_0.6.35           
##  [67] R6_2.5.1                 gridGraphics_0.5-1       colorspace_2.1-0        
##  [70] GO.db_3.18.0             RSQLite_2.3.5            utf8_1.2.4              
##  [73] tidyr_1.3.1              generics_0.1.3           data.table_1.15.0       
##  [76] rtracklayer_1.62.0       prettyunits_1.2.0        graphlayouts_1.1.0      
##  [79] httr_1.4.7               S4Arrays_1.2.0           scatterpie_0.2.1        
##  [82] pkgconfig_2.0.3          gtable_0.3.4             blob_1.2.4              
##  [85] shadowtext_0.1.3         htmltools_0.5.7          fgsea_1.28.0            
##  [88] png_0.1-8                ashr_2.2-63              ggfun_0.1.4             
##  [91] rstudioapi_0.15.0        rjson_0.2.21             curl_5.2.0              
##  [94] cachem_1.0.8             parallel_4.3.1           HDO.db_0.99.1           
##  [97] restfulr_0.0.15          pillar_1.9.0             grid_4.3.1              
## [100] vctrs_0.6.5              dbplyr_2.4.0             xtable_1.8-4            
## [103] evaluate_0.23            invgamma_1.1             cli_3.6.2               
## [106] locfit_1.5-9.8           compiler_4.3.1           Rsamtools_2.18.0        
## [109] rlang_1.1.3              crayon_1.5.2             SQUAREM_2021.1          
## [112] labeling_0.4.3           plyr_1.8.9               fs_1.6.3                
## [115] stringi_1.8.3            viridisLite_0.4.2        munsell_0.5.1           
## [118] lazyeval_0.2.2           GOSemSim_2.28.1          Matrix_1.6-5            
## [121] hms_1.1.3                patchwork_1.2.0          bit64_4.0.5             
## [124] statmod_1.5.0            KEGGREST_1.42.0          highr_0.10              
## [127] igraph_2.0.2             memoise_2.0.1            bslib_0.6.1             
## [130] ggtree_3.10.1            fastmatch_1.1-4          bit_4.0.5               
## [133] ape_5.7-1                gson_0.1.0